#!/usr/bin/perl
#
# uniq-maildirs [-m grepfile] mdir1 mdir2 ...
#
# Walk every maildir named on the command line, descending recursively
# into subdirectories, and read each file's header for a Message-I[dD]
# line.  The first file seen with a given Message-Id is kept; every later
# file carrying the same Message-Id is listed on stdout in sh command
# format, "rm -f FILENAME", so the output can be run as a script.
#
# If the -m arg is used, the named file is read first for Message-IDs in
# UNIX "grep" output format, e.g. "filename:Message-Id: blah".  Any mail
# found later with one of those Message-IDs is listed as a duplicate.

use File::Find;

# Autoflush the selected output handle (STDOUT) so output appears promptly.
$| = 1;

# Optional "-m grepfile": seed %file (message-id => filename) from
# grep-style lines, so that mails scanned later which carry one of these
# ids are reported as duplicates of the file named here.
# %file and $count are package globals shared with wanted().
if (defined $ARGV[0] && $ARGV[0] eq '-m') {
  shift @ARGV;
  my $msgidsfile = shift @ARGV;
  defined $msgidsfile
    or die "usage: uniq-maildirs [-m grepfile] mdir1 mdir2 ...\n";

  %file = ();
  $count = 0;

  # Three-arg open with a lexical handle: the name is never parsed for mode
  # characters or trimmed, and the handle cannot collide with the one used
  # while scanning the maildirs.
  open (my $in, '<', $msgidsfile)
    or die "cannot read msgids from $msgidsfile: $!";
  while (my $line = <$in>) {
    # Maildir files in cur/ are named "...:2,FLAGS", so the filename part
    # may itself contain colons.  Match minimally up to the first
    # ":Message-I?: " marker instead of stopping at the first colon.
    $line =~ /^(.+?):Message-I.: (\S+)/ or next;
    my ($f, $m) = ($1, $2);

    # Strip the surrounding angle brackets, if any, from the id.
    $m =~ s/^<//; $m =~ s/>$//;

    $file{$m} = $f;
    $count++;

    progress ($count, "m");
  }
  close $in;
  warn "\nfound $count message-ids.\n";
}

# Reset the tallies, then walk each maildir given on the command line.
# wanted() is called for every entry found and updates $count and $dups;
# no_chdir keeps the working directory fixed during the traversal.
($count, $dups) = (0, 0);
for my $maildir (@ARGV) {
  File::Find::find ({ no_chdir => 1, wanted => \&wanted }, $maildir);
}
warn "\nscanned $count mails, $dups dups.\n";

# Quote $s for use as a single /bin/sh word.  Names made only of plainly
# safe characters are returned untouched, so the usual output is unchanged;
# anything else is wrapped in single quotes with embedded quotes escaped.
sub _sh_quote {
  my ($s) = @_;
  return $s if $s =~ m{\A[A-Za-z0-9_/.,:=+%\@-]+\z};
  $s =~ s/'/'\\''/g;
  return "'$s'";
}

# File::Find callback, invoked once per entry under each maildir.  The
# caller sets no_chdir, so $File::Find::name is used directly as the path.
#
# Reads the header of each regular file up to the first Message-I[dD] line.
# If that id is already in %file, a "# DUP:" comment and an "rm -f" command
# for this file are printed on stdout and $dups is incremented; otherwise
# the file is recorded in %file as the holder of that id.  $count is
# incremented once per regular file examined.
sub wanted {
  my $path = $File::Find::name;

  # find() also hands us the directories themselves; they are not mails,
  # so keep them out of the count and do not try to read them.
  return unless -f $path;
  $count++;

  # Three-arg open with a lexical handle: the filename is never parsed for
  # mode characters or trimmed of whitespace.  Skip the file if it cannot
  # be opened rather than reading from a closed handle.
  my $fh;
  unless (open ($fh, '<', $path)) {
    warn "cannot read $path: $!";
    return;
  }

  while (my $line = <$fh>) {
    # A blank line (LF or CRLF) ends the header section.
    last if $line =~ /^\r?$/;
    $line =~ /^Message-I[dD]: (\S+)/ or next;

    my $m = $1;
    $m =~ s/^<//; $m =~ s/>$//;
    if (exists $file{$m}) {
      # The output is meant to be run as a shell script: keep the comment
      # on a single line and quote the filename so odd names cannot break
      # or inject commands.
      (my $note = "$path dup of $file{$m}") =~ tr/\r\n/??/;
      print "\n# DUP: $note\n";
      print "rm -f " . _sh_quote ($path) . "\n";
      $dups++;
      progress ($count, "*");
    } else {
      $file{$m} = $path;
      progress ($count, ".");
    }
    # Only the first Message-Id header of a file is considered.
    last;
  }
  close $fh;
}

# Progress indicator on STDERR.
#   $c   - running item count
#   $sym - mark to print
# Prints $sym whenever $c is a multiple of 500, and a newline whenever $c
# is a multiple of 500*70, i.e. after 70 marks on a line.
sub progress {
  my ($c, $sym) = @_;

  my $per_mark = 500;
  my $per_line = $per_mark * 70;

  print STDERR $sym if ($c % $per_mark) == 0;
  print STDERR "\n" if ($c % $per_line) == 0;
}
